* ==============================================================================
* Script:  1_data_cleaning.do
* Purpose: a. imports the raw data and saves it in Stata-readable format
*		   b. processes the main dataset in preparation for analysis
*
* Index
* 1. Bilateral trade data		- Source: BACI using HS96 (publicly available at https://www.cepii.fr/CEPII/en/bdd_modele/bdd_modele_item.asp?id=37)
* 2. NTM Data (SPS and STC)		- Source: Ghodsi et al (2017). The evolution of non-tariff measures and their diverse effects on trade. wiiw Research Report No 419, Vienna Institute for International Economic Studies, May (2017).
* 3. Tariff Data 				- Source: WITS
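* 4. Bilateral gravity data 	- Source: CEPII
* 5. Income classification 	- Source: income_class.xlsx (sheet "List of economies")
* 6. Merge of the processed datasets into translog_gravity
*   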
* ==============================================================================

* Machine-specific folders holding the raw inputs (adjust to the local setup).
global ntm  "/Users/ddfiankor/ownCloud/Data/NTM Ghodsi/"
global baci "/Users/ddfiankor/ownCloud/Data/baci/"

* Work from the project's data folder. $translog_gvty is not set in this file,
* so it has to be defined before this script is run.
cd "$translog_gvty/data"

* 1. BILATERAL TRADE
* ==================
* Load the HS6-level bilateral flows and keep agricultural chapters (HS 01-24).
use "$baci/baci96/baci96.dta", clear
gen hs2 = int(hs6/10000)
drop if hs2 > 24
order i j hs6 t

* Extensive margins (n_i in Novy): number of distinct HS2 chapters / HS6 lines
* that exporter i ships in year t. They are computed in the working data and
* carried through the aggregation, because the file saved below is keyed on the
* numeric BACI code while i is a string by the time the panel is balanced.
bys i t hs2: gen byte nhs2 = _n == 1
bys i t hs6: gen byte nhs6 = _n == 1
bys i t: egen n2_it = total(nhs2)
bys i t: egen n6_it = total(nhs6)
drop nhs2 nhs6

* Keep writing the stand-alone extensive-margin file (numeric exporter code).
preserve
	keep i t n2_it n6_it
	duplicates drop
	save "$translog_gvty/processed/ext_margin", replace
restore

* Aggregate to the country-pair-year level (drops the product dimension).
collapse (sum) v_n = v q_n = q (max) n2_it n6_it, by(i j t)

* BACI country codes are numbers but the other datasets have names,
* so attach the country names and use them as identifiers.
merge m:1 i using "$baci/baci96/country_code_baci96.dta", keep(3) nogen
rename name exp
merge m:1 j using "$baci/baci96/country_code_baci96_j.dta", keep(3) nogen
rename name imp
do countrynames
drop i j
rename (exp imp) (i j)

* BACI has Belgium-Luxembourg instead of Belgium & Luxembourg. Recode the
* exporter before balancing the panel so that BLX and BEL cannot end up as two
* rows with the same (i, j, t) key.
replace i = "BEL" if i == "BLX"

* To avoid dealing with EU expansion, the EU-15 importers are treated as one
* group. Every member is relabelled and the flows are summed, so the aggregate
* exists even when an exporter has no recorded flow to Germany, and quantities
* are aggregated the same way as values.
replace j = "EU15" if inlist(j, "FRA", "DEU", "ITA", "NLD", "BEL", "BLX", "LUX", "GBR") ///
                    | inlist(j, "DNK", "IRL", "ESP", "PRT", "GRC", "AUT", "SWE", "FIN")
collapse (sum) v_n q_n (max) n2_it n6_it, by(i j t)

* Balance the panel; the added rows are the zero-trade observations.
fillin i j t
drop _fillin

* The margins vary by (i, t) only: copy them onto the rows fillin created.
* Exporter-years with no flow at all keep missing margins.
foreach m of varlist n2_it n6_it {
	bys i t (`m'): replace `m' = `m'[1] if missing(`m')
}

rename (v_n q_n) (v q)
replace v = 0 if missing(v)
replace q = 0 if missing(q)

order i j t
sort  i j t
save "$translog_gvty/processed/baci96", replace 
	
* 2. NTM Data (Ghodsi et al)
* ==========================
  * Load the NTM counts and keep agricultural chapters: the HS chapter is read
  * from the first two characters of Product (non-numeric codes become missing
  * and are discarded together with chapters above 24).
  use "$ntm/accu_nwld_95_17.dta", clear
  gen hs2 = substr(Product, 1, 2)
  destring hs2, force replace
  keep if hs2 <= 24

  * Same identifiers as the trade data: j = importer, i = exporter, t = year.
  rename (ImporterISO3 ExporterISO3 Year) (j i t)
  keep i j t SPSSTC

  * Total SPSSTC per pair-year, one row per (i, j, t).
  bys i j t: egen sps = total(SPSSTC)
  drop SPSSTC
  duplicates drop

  * The EU-15 importer is represented by Germany's row; the other members are dropped.
  replace j = "EU15" if j == "DEU"
  drop if inlist(j, "FRA", "ITA", "NLD", "BEL", "BLX", "LUX", "GBR") ///
        | inlist(j, "DNK", "IRL", "ESP", "PRT", "GRC", "AUT", "SWE", "FIN")
  save "$translog_gvty/processed/ntm", replace 

* 3. Tariffs
* ==========
  * Read the tariff extract and harmonise names: j = reporter (importer),
  * i = partner (exporter), t = tariff year, tariff = simple average.
  import delimited tariff_wto, varnames(1) clear
  keep reportername partnername tariffyear dutytype simpleaverage weightedaverage
  rename (reportername partnername) (imp exp)
  do countrynames
  rename (imp exp tariffyear simpleaverage) (j i t tariff)

  * The EU reports as one block; individual EU-15 reporters are discarded.
  replace j = "EU15" if j == "European Union"
  drop if inlist(j, "FRA", "ITA", "NLD", "BEL", "BLX", "LUX", "GBR", "DNK") ///
        | inlist(j, "IRL", "ESP", "PRT", "GRC", "AUT", "SWE", "FIN", "DEU")
  save "$translog_gvty/processed/tariff_wb", replace

  * Give each later member the EU schedule from its accession onwards: make 13
  * copies of every EU row (EU = copy number 1..13; expgen is a user-written
  * command), relabel copy k as member k from the relabel year on, and drop the
  * copy for the years before the cut-off.
  * NOTE(review): for HRV the relabel year is 2013 but the cut-off is 2014, so
  * the 2013 copy is discarded - confirm this is intended.
  keep if j == "EU15"
  expgen = 13 if j == "EU15", copy(EU)
  local members HUN  LVA  LTU  POL  SVK  SVN  MLT  CZE  EST  CYP  BGR  ROU  HRV
  local relabel 2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 2007 2007 2013
  local cutoff  2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 2007 2007 2014
  forvalues k = 1/13 {
    local iso  : word `k' of `members'
    local from : word `k' of `relabel'
    local keep : word `k' of `cutoff'
    replace j = "`iso'" if EU == `k' & t >= `from'
    drop if EU == `k' & t < `keep'
  }

  * Put the copies back with the original reporters.
  append using 	"$translog_gvty/processed/tariff_wb"
  drop EU 

  * One row per (i, j, t) with one tariff column per duty type. encode numbers
  * the duty types in alphabetical order; the new names assume these are
  * AHS, BND and MFN - TODO confirm against the raw file.
  encode dutytype, gen(duty) 
  drop dutytype weightedaverage
  reshape wide tariff, i(i j t) j(duty)
  rename (tariff1 tariff2 tariff3) (tariff_ahs tariff_bnd tariff_mfn)
  save "$translog_gvty/processed/tariff_wb", replace
  
* CEPII dataset
* =============
* Build an EU-15 importer row in the gravity data.
use "$translog_gvty/data/cepii", clear

* For every exporter i, add up each dummy over its EU-15 partners
* (the sum is only defined on the EU-15 rows).
foreach g of varlist colony contig comlangoff {
  bys i: egen new`g' = total(`g') if inlist(j, "FRA", "ITA", "NLD", "BEL", "BLX", "LUX", "GBR", "DNK") ///
                                   | inlist(j, "IRL", "ESP", "PRT", "GRC", "AUT", "SWE", "FIN", "DEU")
}

* The Netherlands' row becomes the EU15 row and the other members are dropped,
* so every other column of the EU15 row holds the Netherlands' value.
replace j = "EU15" if j == "NLD"
drop if inlist(j, "FRA", "ITA", "DEU", "BEL", "BLX", "LUX", "GBR") ///
      | inlist(j, "DNK", "IRL", "ESP", "PRT", "GRC", "AUT", "SWE", "FIN")

* The EU15 dummy equals 1 when the link holds with at least one member.
foreach g in colony contig comlangoff {
  replace `g' = min(new`g', 1) if j == "EU15"
}
drop new* 
save "$translog_gvty/processed/cepii_eu", replace
  
* Income class
* Importer version: economy code and income group, with Germany's entry
* relabelled as EU15.
import excel using "income_class.xlsx", sheet("List of economies") cellrange(C5:L295) firstrow clear
drop in 1
keep Code Incomegroup
keep if Incomegroup != ""
rename Code j 
replace j = "EU15" if j == "DEU"
save "$translog_gvty/processed/income_j", replace 

* Exporter version: same table keyed on i, with Germany restored and two codes
* recoded (LUX -> BLX, PAL -> PSE).
rename j i 
replace i = "PSE" if i == "PAL"
replace i = "BLX" if i == "LUX"
replace i = "DEU" if i == "EU15"
save "$translog_gvty/processed/income_i", replace 

* Merging the datasets
* Assemble the estimation sample: start from the balanced trade panel and add
* NTMs, gravity controls, RTAs and tariffs. Only rows of the trade panel are kept.
cd "$translog_gvty/processed"
use baci96, clear
* The pair-year merges are 1:1: a duplicated key on either side stops the script
* at the merge instead of silently multiplying rows.
merge 1:1 i j t using ntm,       keep(1 3) nogen 
merge m:1 i j   using cepii_eu,  keep(1 3) nogen 
*merge m:1 j t   using ntm_j,  keep(1 3) nogen 
merge 1:1 i j t using "$translog_gvty/data/rta", keep(1 3) nogen 
merge 1:1 i j t using tariff_wb, keep(1 3) nogen 

* Preferred tariff: applied (AHS), then MFN, then bound.
gen double tariff = tariff_ahs
replace tariff = tariff_mfn if missing(tariff)
replace tariff = tariff_bnd if missing(tariff)

* Membership dummies: later EU members from their accession year (exporter and
* importer side) and EU-15 exporters.
foreach s in i j {
	gen byte eu24`s' = ((inlist(`s', "CYP", "CZE", "EST", "HUN", "LVA") ///
	                   | inlist(`s', "LTU", "MLT", "POL", "SVN", "SVK")) & t >= 2004) ///
	                 | (inlist(`s', "BGR", "ROU") & t >= 2007) ///
	                 | (`s' == "HRV" & t >= 2013)
}
gen byte eu15i = inlist(i, "FRA", "ITA", "NLD", "BEL", "BLX", "LUX", "GBR", "DNK") ///
               | inlist(i, "IRL", "ESP", "PRT", "GRC", "AUT", "SWE", "FIN", "DEU")

* Intra-EU trade is tariff free.
* NOTE(review): flows between two later members (eu24i == 1 & eu24j == 1) are
* not set to zero here - confirm whether they should be.
replace tariff = 0 if j == "EU15" & eu24i == 1
replace tariff = 0 if eu15i == 1  & eu24j == 1
replace tariff = 0 if j == "EU15" & eu15i == 1

* Fill gaps within a pair with the last observed tariff (replace works through
* the observations in order, so a value is carried across consecutive gaps) ...
bys i j (t): replace tariff = tariff[_n-1] if missing(tariff)
* ... and what is still missing with the highest tariff exporter i faces in year t.
bys i t: egen tariff_max_it = max(tariff)
replace tariff = tariff_max_it if missing(tariff)

drop if inlist(i, "SDN", "YUG")
*drop if j == "EU15" & eu15i == 1 // Drop intra EU trade? 
keep i j t v q n2_it n6_it sps rta tariff colony comlangoff contig distcap // partnername reportername

* Pairs without an NTM record or trade flow are zeros, not missing.
foreach x of varlist sps v {
	replace `x' = 0 if missing(`x')
}
compress 
save translog_gravity, replace 
